home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.mactech.com 2010
/
ftp.mactech.com.tar
/
ftp.mactech.com
/
machack
/
Hacks97
/
NewsTicker.sit
/
NewsTicker
/
source code
/
Extractors
/
HTMLExtractor.cp
< prev
next >
Wrap
Text File
|
1997-06-27
|
12KB
|
599 lines
/*------------------------------------------------------------------------------
#
# NewsTicker, my Hack for 1997
#
# HTMLExtractor.cp - Base class to read an HTML page in, and parse
# out the interesting stuff. Useless on its own,
# only exists to be derived..
#
------------------------------------------------------------------------------*/
#include <Threads.h>
#include <strings.h>
#include "HTMLExtractor.h"
#include "SubWooferEndPoint.h"
#include "HTTPEndPoint.h"
#include "Idler.h"
#include "TickerGlobals.h" //get our structures and all
#include "TickerWindowHandler.h"
#include "BeachBall.h"
#include <string.h>
// Beach-ball cursor shared by every extractor in this file. It starts out nil,
// is created on first use by ReadEntries/ReadLastModified, and is spun from
// TickerIdler::YieldAction. NOTE(review): nothing visible in this file deletes it.
BeachBall* gTheBall = nil;
//
// Idler used while an HTMLExtractor is waiting on the network. Its YieldAction
// keeps the application responsive and lets the user abort the transfer.
//
class TickerIdler : public Idler
{
    public:
        // theExtractor is the extractor whose Cancel() is invoked on abort; may be nil.
        TickerIdler (HTMLExtractor* theExtractor);

        // One slice of idle work; see the definition for details.
        virtual void        YieldAction (void);

        virtual             ~TickerIdler (void) { }

    protected:
        // Copying is declared here but given no definition in this file
        // (presumably to forbid copies -- confirm).
        TickerIdler (const TickerIdler& oRHS);
        TickerIdler&        operator= (const TickerIdler& oRHS);

    private:
        long                mlWNEDelay;         // sleep value passed to WaitNextEvent
        HTMLExtractor*      mftheExtractor;     // extractor to cancel; not owned
};
//
// The idler below does many things while waiting for data to be sent to or received
// from the Internet. It spins a beach ball, notices when we move to or from the
// background, and scrolls our window below us. It also recognizes command-period or
// closing the window as a request to abort a read/write.
//
#define kDefaultWNEDelay 15         // sleep value handed to WaitNextEvent by YieldAction
#define kDefaultIdlerPeriod 15      // period handed to the Idler base class via SetPeriod

//
// Build an idler bound to theExtractor (which may be nil; YieldAction checks).
// Tells the Idler base class how often to call YieldAction and records the
// WaitNextEvent sleep time.
//
// The original passed kDefaultWNEDelay to SetPeriod and stored kDefaultIdlerPeriod
// in mlWNEDelay. Both constants are 15, so behaviour is unchanged, but each value
// now goes where its name says it belongs.
//
TickerIdler::TickerIdler (HTMLExtractor* theExtractor)
{
    SetPeriod (kDefaultIdlerPeriod);
    mlWNEDelay = kDefaultWNEDelay;
    mftheExtractor = theExtractor;
}
void TickerIdler::YieldAction (void)
{
EventRecord sEvent;
JustHandleWindow();
#ifdef USESUBWOOFER
YieldToAnyThread();
#endif
if ((gTheBall!=nil)&&(!gInBackground))
gTheBall->Idle(); //spin our beach ball cursor
if (!gDoneFlag)
{
if (mftheExtractor)
mftheExtractor->Cancel();
}
if (WaitNextEvent(everyEvent , &sEvent, mlWNEDelay, nil))
{
switch (sEvent.what)
{
case kHighLevelEvent:
AEProcessAppleEvent( &sEvent ) ;
break;
case keyDown:
if (((sEvent.message & charCodeMask)=='.')&&(sEvent.modifiers & cmdKey))
{
if (mftheExtractor)
mftheExtractor->Cancel();
}
break;
case osEvt:
if (((sEvent.message >> 24) & 0x0FF) == kSuspendResumeMessage) /* high byte of message */
{
gInBackground = (sEvent.message & kResumeMask) == 0;
}
}
}
}
//
// Build an extractor for one site.
//   theaddress - C string naming the host to read from; copied into mfAddress.
//                A nil pointer is treated as an empty address.
//   theIconID  - cicn resource ID stamped on every headline this extractor adds.
//   theDataPtr - shared headline table that ReadEntries merges its results into.
//
HTMLExtractor::HTMLExtractor (char* theaddress, short theIconID, sMyDataPtr theDataPtr)
{
#ifdef USESUBWOOFER
    mfWebPipe = nil;
#else
    mfHTTPPipe = nil;
#endif
    mfDoingARead = false;
    // These two were previously left uninitialised until the first
    // ReadEntries/ReadLastModified call, although AddEntry and ReceiveString
    // read them. Starting in "header" mode means a stray ReceiveString only
    // inspects the line instead of touching the tag/text parse state.
    mfReadingHeader = true;
    mfTempHeadlineCount = 0;
    mfTheDataPtr = theDataPtr;
    mfLastModified[0] = 0;
    mfIconID = theIconID;
    // NOTE(review): the copy is unbounded; mfAddress is declared in
    // HTMLExtractor.h, so its capacity cannot be checked from here.
    if (theaddress != nil)
        strcpy(mfAddress, theaddress);
    else
        mfAddress[0] = 0;
}
//
// Tear down whichever endpoint this build uses, if one is still alive.
//
HTMLExtractor::~HTMLExtractor (void)
{
    // delete on a nil pointer does nothing, so no explicit test is needed.
#ifdef USESUBWOOFER
    delete mfWebPipe;
    mfWebPipe = nil;
#else
    delete mfHTTPPipe;
    mfHTTPPipe = nil;
#endif
}
//
// Append one headline (subject plus URL, both Pascal strings) to the temporary
// list built during a read. Entries beyond tempmaxHeadlines are silently dropped.
//
void HTMLExtractor::AddEntry(Str255 theSubject, Str255 theURL)
{
    // Demo mode: once three headlines are in hand, refuse more and end the read.
    if (gThePrefs.JustShowFirstThree && (mfTempHeadlineCount >= 3))
    {
        mfDoingARead = false;
        return;
    }

    // Temporary list is full -- ignore the entry.
    if (!(mfTempHeadlineCount < tempmaxHeadlines))
        return;

    PLstrcpy(mfTempHeadlines[mfTempHeadlineCount].Subject, theSubject);
    PLstrcpy(mfTempHeadlines[mfTempHeadlineCount].URL, theURL);
    mfTempHeadlines[mfTempHeadlineCount].cicnResID = mfIconID;
    mfTempHeadlineCount++;
}
//
// Called by the base app to read all entries in.
//
// Opens an endpoint to mfAddress on port 80, then alternates between idling the
// endpoint and TickerIdler::YieldAction until mfDoingARead is cleared (by
// Disconnect, Cancel, or AddEntry in demo mode). Afterwards every headline in
// the shared table that carries this extractor's icon ID is removed and the
// headlines gathered during this read are appended, up to maxHeadlines.
//
// If the endpoint cannot start the transfer, the function returns without
// touching the shared table.
//
// The unused locals thebuffer and buffersize were removed; io now exists only
// in the USESUBWOOFER build, the only one that assigns it.
//
void HTMLExtractor::ReadEntries (void)
{
    short index;
    short destindex;
    TickerIdler* theidler = new TickerIdler(this);
#ifdef USESUBWOOFER
    OSErr io;
#endif

    // Reset the per-read parse state.
    mfDoingARead = true;
    mfReadingHeader = true;
    thetextsize = 0;
    thetagsize = 0;
    AmOnTag = false;
    mfTempHeadlineCount = 0;

    if (!gTheBall)
        gTheBall = new BeachBall();

    // Use the subwoofer code
#ifdef USESUBWOOFER
    mfReadingHeader = false;    //we don't get headers from Subwoofer
    if (mfWebPipe)
    {
        delete mfWebPipe; mfWebPipe = nil;
    }
    mfWebPipe = new SubWooferEndPoint(this);
    if (mfWebPipe->StartGettingFile(mfAddress, 80, theidler)!=noErr)
    {
        delete mfWebPipe; mfWebPipe = nil;
        delete theidler;
        return;
    }
    do
    {
        mfWebPipe->DoIdle();
        theidler->YieldAction();
    }
    while (mfDoingARead);
    // NOTE(review): the result code is stored but never examined.
    io = mfWebPipe->GetSubWoofHeader(mfLastModified);
    delete mfWebPipe;
    mfWebPipe = nil;
#else
    //
    // Use the raw OT stuff
    if (mfHTTPPipe)
    {
        delete mfHTTPPipe; mfHTTPPipe = nil;
    }
    mfHTTPPipe = new HTTPEndPoint(this);
    if (mfHTTPPipe->StartGettingFile(mfAddress, 80, theidler)!=noErr)
    {
        delete mfHTTPPipe; mfHTTPPipe = nil;
        delete theidler;
        return;
    }
    do
    {
        mfHTTPPipe->DoIdle();
        theidler->YieldAction();
    }
    while (mfDoingARead);
    delete mfHTTPPipe;
    mfHTTPPipe = nil;
#endif
    delete theidler;

    // Delete all entries with cicnResID = mfIconID by compacting the
    // survivors toward the front of the table.
    destindex = 0;
    for (index = 0; index < mfTheDataPtr->MsgCount; index++)
    {
        if (mfTheDataPtr->theHeadlines[index].cicnResID!=mfIconID)  //don't delete it
        {
            if (index!=destindex)   //copy down if we need to
            {
                mfTheDataPtr->theHeadlines[destindex] = mfTheDataPtr->theHeadlines[index];
            }
            destindex++;
        }
    }
    mfTheDataPtr->MsgCount = destindex;

    // Now copy the entries we accumulated out; anything past maxHeadlines is dropped.
    for (index = 0; index<mfTempHeadlineCount; index++)
    {
        if (mfTheDataPtr->MsgCount<maxHeadlines)
        {
            mfTheDataPtr->theHeadlines[mfTheDataPtr->MsgCount] = mfTempHeadlines[index];
            mfTheDataPtr->MsgCount++;
        }
    }
}
// Called by endpoint as it gets strings
void HTMLExtractor::ReceiveString (char* string, short numchars)
{
short index;
char thechar;
if (mfReadingHeader)
{
if (numchars <= 2) //must be crlf
mfReadingHeader = false;
else
{
//if Last-modifed line, save it
if (MyCompareStr(string, "Last-Modified:"))
{
if (numchars>31)
numchars = 31;
mfLastModified[0] = numchars;
BlockMove(string, &mfLastModified[1], numchars);
}
}
}
else
{
for (index = 0; index<numchars; index++)
{
thechar = string[index];
if ((thechar==0x0d)||(thechar==0x0a)||(thechar==0x09))//make carriage returns and line feeds spaces
thechar = ' ';
if (AmOnTag)
{
if ((thetagsize<2047)&&((thetagsize>0)||(thechar!=' '))) //add this character to the tag
{
thetag[thetagsize] = thechar; thetagsize++;
}
if (thechar=='>') //end of tag?
{
thetag[thetagsize] = 0; //make it a nice C string
HandleToken(thetag, thetagsize, true); //and handle it
thetextsize = 0; //And star getting text
AmOnTag = false;
}
}
else
{
if (thechar=='<') //start of tag?
{
if (thetextsize>0) //any text to handle?
{
thetext[thetextsize] = 0;
HandleToken(thetext, thetextsize, false); //handle the text
}
thetag[0] = thechar; //put this in the tag and start parsing it
thetagsize = 1;
AmOnTag = true;
}
else //nope, just add to the text
{
if ((thetextsize<2047)&&((thetextsize>0)||(thechar!=' ')))
{
thetext[thetextsize] = thechar; thetextsize++;
}
}
}
}
}
}
// Receives each completed tag (isCommand true) or run of text (isCommand false)
// from ReceiveString as a NUL-terminated string of numchars characters.
// The base class does nothing with it; the file header says this class only
// exists to be derived, so subclasses are expected to supply the real parsing.
void HTMLExtractor::HandleToken(char* string, short numchars, Boolean isCommand)
{
}
// Ends the current read: clearing mfDoingARead makes the wait loops in
// ReadEntries/ReadLastModified exit.
// NOTE(review): presumably called by the endpoint when the connection closes -- no caller is visible here.
void HTMLExtractor::Disconnect(void)
{
mfDoingARead = false;
}
// Cancel the connection.
// Called from TickerIdler::YieldAction (e.g. on command-period). Clearing
// mfDoingARead makes the wait loops in ReadEntries/ReadLastModified exit.
void HTMLExtractor::Cancel(void)
{
mfDoingARead = false;
}
//
// Called by the base app to read just the header in.
//
// Opens an endpoint to mfAddress on port 80 with StartGettingHeader, then
// alternates between idling the endpoint and TickerIdler::YieldAction until
// mfDoingARead is cleared. In the raw OT build the Last-Modified line is
// captured by ReceiveString; in the USESUBWOOFER build it is fetched with
// GetSubWoofHeader once the loop ends. Read the result with GetLastModified.
//
// The unused locals thebuffer and buffersize were removed; io now exists only
// in the USESUBWOOFER build, the only one that assigns it.
//
void HTMLExtractor::ReadLastModified(void)
{
    TickerIdler* theidler = new TickerIdler(this);
#ifdef USESUBWOOFER
    OSErr io;
#endif

    // Reset the per-read parse state.
    mfDoingARead = true;
    mfReadingHeader = true;
    thetextsize = 0;
    thetagsize = 0;
    AmOnTag = false;
    mfTempHeadlineCount = 0;

    if (!gTheBall)
        gTheBall = new BeachBall();

    // Use the subwoofer code
#ifdef USESUBWOOFER
    if (mfWebPipe)
    {
        delete mfWebPipe; mfWebPipe = nil;
    }
    mfWebPipe = new SubWooferEndPoint(this);
    if (mfWebPipe->StartGettingHeader(mfAddress, 80, theidler)!=noErr)
    {
        delete mfWebPipe; mfWebPipe = nil;
        delete theidler;
        return;
    }
    do
    {
        mfWebPipe->DoIdle();
        theidler->YieldAction();
    }
    while (mfDoingARead);
    // NOTE(review): the result code is stored but never examined.
    io = mfWebPipe->GetSubWoofHeader(mfLastModified);
    delete mfWebPipe;
    mfWebPipe = nil;
#else
    //
    // Use the raw OT stuff
    if (mfHTTPPipe)
    {
        delete mfHTTPPipe; mfHTTPPipe = nil;
    }
    mfHTTPPipe = new HTTPEndPoint(this);
    if (mfHTTPPipe->StartGettingHeader(mfAddress, 80, theidler)!=noErr)
    {
        delete mfHTTPPipe; mfHTTPPipe = nil;
        delete theidler;
        return;
    }
    do
    {
        mfHTTPPipe->DoIdle();
        theidler->YieldAction();
    }
    while (mfDoingARead);
    delete mfHTTPPipe;
    mfHTTPPipe = nil;
#endif
    delete theidler;
}
// Copies the saved Last-Modified value (a Pascal string) into LastModStr.
// In the raw OT build the value is the whole "Last-Modified:" header line as
// stored by ReceiveString, cut to 31 characters; it is empty until a read has
// captured one.
void HTMLExtractor::GetLastModified (Str31 LastModStr)
{
PLstrcpy(LastModStr, mfLastModified);
}
//
// Here is some standard code to help parse the HTML
//
// Returns a pointer to the first character at or after pcSrc that is not a
// space, carriage return or line feed. Tabs are not skipped. Stops at the
// terminating NUL.
static char* SkipWhiteChars(char* pcSrc)
{
    for (;;)
    {
        char current = *pcSrc;

        // NUL is none of the three, so the end of the string also lands here.
        if ((current != ' ') && (current != '\r') && (current != '\n'))
            return pcSrc;
        pcSrc++;
    }
}
// Skips white space, then at most one '=', then white space again, and returns
// a pointer to whatever follows.
static char* SkipWhiteCharsAndEqual(char* pcSrc)
{
    char* cursor = SkipWhiteChars(pcSrc);

    if (*cursor == '=')
        cursor++;
    return SkipWhiteChars(cursor);
}
// Returns true when the first strlen(p2) characters of p1 match p2 according
// to IdenticalText.
//   p1 - text to test; it may be longer than p2, and need not be NUL-terminated
//        as long as strlen(p2) bytes are readable or a NUL appears sooner.
//   p2 - NUL-terminated prefix to look for.
//
// If p1 ends (hits a NUL) before strlen(p2) characters, it cannot match, and
// the comparison is skipped so that nothing past p1's terminator is read.
Boolean MyCompareStr(char* p1, char* p2)
{
    short thelength = strlen(p2);

    // memchr stops at the first NUL, so it never looks beyond p1's own end.
    if (memchr(p1, 0, thelength) != nil)
        return false;
    return (IdenticalText (p1, p2, thelength, thelength, nil)==0);
}
//
// Look for some quoted data for a given marker.
//   tag       - NUL-terminated tag text, positioned on the space that precedes
//               the first attribute.
//   theLink   - receives the quoted value as a C string; left empty when the
//               marker is not found or the value is rejected.
//   theMarker - attribute name to look for (e.g. "HREF").
//
// A value is rejected (theLink left empty) when it is not quoted, starts with
// '#', contains '$', or is longer than 255 characters. Everything up to and
// including a '?' in the value is discarded.
//
// Changes from the original:
//   - the marker is skipped by its string length; sizeof(theMarker) was the
//     size of a pointer, which matched "HREF" only by coincidence;
//   - the attribute-skipping loops test before advancing, so they can no
//     longer step past the terminating NUL or over an empty quoted value;
//   - the copy is capped. The limit is 255 because the only caller visible in
//     this file supplies a 256-byte buffer -- NOTE(review): confirm other callers.
//
void FindATag(char* tag, char* theLink, char* theMarker)
{
    const long kMaxLinkChars = 255;
    char* cp = theLink;

    *cp = 0;
    do
    {
        if (*tag != ' ')
            return;
        tag++;
        tag = ::SkipWhiteChars(tag);
        if (::MyCompareStr(tag, theMarker))
        {
            tag += strlen(theMarker);
            tag = ::SkipWhiteCharsAndEqual(tag);
            if (*tag != '"')
                return;
            tag++;
            if (*tag == '#')        //A navigation on same page link
                return;
            while ((*tag != 0) && (*tag != '"'))
            {
                if (*tag=='?')      //restart, this was apple funkiness
                {
                    cp = theLink;
                    tag++;
                }
                else if (*tag=='$') //another part of funkiness, this isn't a good link
                {
                    *theLink = 0;
                    return;
                }
                else
                {
                    if ((cp - theLink) >= kMaxLinkChars)    //too long to store safely
                    {
                        *theLink = 0;
                        return;
                    }
                    *(cp++) = *(tag++);
                }
            }
            *cp = 0;                //mark the end
            return;
        }

        //Not our marker: skip this item. Get past the name...
        while ((*tag != 0) && (*tag != '='))
            tag++;
        if (*tag == '=')            //skip the =
            tag++;
        if (*tag == '"')            //...past any quoted data, closing quote included...
        {
            tag++;
            while ((*tag != 0) && (*tag != '"'))
                tag++;
            if (*tag == '"')
                tag++;
        }
        //...and past any unquoted data, waiting for a space
        while ((*tag != 0) && (*tag != ' ') && (*tag != '>'))
            tag++;
    }
    while ((*tag!='>')&&(*tag!=0));
}
// Extracts the quoted value of a leading HREF attribute.
//   tag      - NUL-terminated tag text, positioned on the space before "HREF".
//   HTMLLink - receives the value as a C string; left empty when the tag does
//              not start with HREF="...", the value starts with '#', or the
//              value is longer than 255 characters.
// Everything up to and including a '?' in the value is discarded.
//
// The copy used to be unbounded. The limit is 255 because the one call site in
// this file (currently commented out) supplies a 256-byte buffer.
static void SaveHRef(char* tag, char* HTMLLink)
{
    const long kMaxLinkChars = 255;
    char* cp = HTMLLink;

    //See if it's A HREF="
    *cp = 0;
    if (*tag != ' ')
        return;
    tag++;
    tag = ::SkipWhiteChars(tag);
    if (!::MyCompareStr(tag, "HREF"))
        return;
    tag += 4;
    tag = ::SkipWhiteCharsAndEqual(tag);
    if (*tag != '"')
        return;
    tag++;
    if (*tag == '#')        //A navigation on same page link
        return;
    while ((*tag != 0) && (*tag != '"'))
    {
        if (*tag=='?')      //restart, this was apple funkiness
        {
            cp = HTMLLink;
            tag++;
        }
        else
        {
            if ((cp - HTMLLink) >= kMaxLinkChars)   //too long to store safely
            {
                *HTMLLink = 0;
                return;
            }
            *(cp++) = *(tag++);
        }
    }
    *cp = 0;                //mark the end
}
// Reports whether theURL contains a ':' anywhere, which this file takes to mean
// the link already carries a scheme and needs no host prepended.
static Boolean isFullURL(char* theURL)
{
    while (*theURL != 0)
    {
        if (*theURL == ':')
            return true;
        theURL++;
    }
    return false;
}
// Appends the C string src to the Pascal string dest. Returns false, leaving
// dest holding only what fitted, if the result would exceed 255 characters.
static Boolean AppendCStringToPString(Str255 dest, const char* src)
{
    for ( ; *src != 0; src++)
    {
        if (dest[0] >= 255)
            return false;
        dest[0]++;
        dest[dest[0]] = *src;
    }
    return true;
}

// Builds an absolute URL from the HREF attribute of a tag.
//   thestring - tag text, positioned on the space before the first attribute.
//   theURL    - receives the URL as a Pascal string; empty on failure.
// Returns true when a usable link was found. A link without a ':' is treated
// as relative and prefixed with "http://", mfAddress and "/". One leading '/'
// of the link is dropped.
//
// The original incremented the length byte without limit, so a long address
// plus link wrapped it past 255 and wrote outside theURL. A URL that does not
// fit is now rejected (false, empty theURL).
Boolean HTMLExtractor::ParseGoodURL(char* thestring, Str255 theURL)
{
    char HTMLLink[256];
    const char* linkText;
    Boolean itFits = true;

    theURL[0] = 0;
    //SaveHRef(thestring, HTMLLink);
    FindATag(thestring, HTMLLink, "HREF");
    if ((HTMLLink[0]==0)||(HTMLLink[0] == '#'))
        return false;

    if (!isFullURL(HTMLLink))
    {
        itFits = AppendCStringToPString(theURL, "http://")
                    && AppendCStringToPString(theURL, mfAddress)    //add our address to it
                    && AppendCStringToPString(theURL, "/");
    }

    linkText = HTMLLink;
    if (*linkText == '/')
        linkText++;
    if (itFits)
        itFits = AppendCStringToPString(theURL, linkText);

    if (!itFits)
    {
        theURL[0] = 0;
        return false;
    }
    return true;
}